# Start from a clean session: remove every object that ls() reports ...
rm(list = ls())
# ... and close all graphics devices that are still open.
graphics.off()
### Load packages
library(tidyverse) # Collection of all the good stuff like dplyr, ggplot2 ect.
library(magrittr) # For extra-piping operators (eg. %<>%)
library(skimr) # For nice data summaries
# The dataset comprises three main tables:
#
# - listings: Detailed listings data showing 106 attributes for each of the
#   listings. Some of the attributes which are intuitively interesting are:
#   price (continuous), longitude (continuous), latitude (continuous),
#   listing_type (categorical), is_superhost (categorical),
#   neighbourhood (categorical), ratings (continuous), among others.
# - reviews: Detailed reviews given by the guests, with 6 attributes. Key
#   attributes include date (datetime), listing_id (discrete),
#   reviewer_id (discrete) and comment (textual).
# - calendar: Provides details about bookings for the next year by listing.
#   Four attributes in total, including listing_id (discrete), date (datetime),
#   available (categorical) and price (continuous).

# Read the detailed listings table straight from the gzip-compressed CSV.
listings <- read_csv(
  "http://data.insideairbnb.com/denmark/hovedstaden/copenhagen/2020-06-26/data/listings.csv.gz"
)
# Print every column with its type and first few values.
listings %>% glimpse()
Rows: 28,523
Columns: 106
$ id <dbl> 6983, 26057, 26473, 29118, 29618, 310…
$ listing_url <chr> "https://www.airbnb.com/rooms/6983", …
$ scrape_id <dbl> 20200626200423, 20200626200423, 20200…
$ last_scraped <date> 2020-06-28, 2020-06-28, 2020-06-28, …
$ name <chr> "Copenhagen 'N Livin'", "Lovely house…
$ summary <chr> "Lovely apartment located in the hip …
$ space <chr> "Beautiful and cosy apartment conveni…
$ description <chr> "Lovely apartment located in the hip …
$ experiences_offered <chr> "none", "none", "none", "none", "none…
$ neighborhood_overview <chr> "Nice bars and cozy cafes just minute…
$ notes <chr> NA, NA, NA, NA, "Please note that the…
$ transit <chr> "Bus 66 runs to the central station. …
$ access <chr> "Bedroom, living room, kitchen, and b…
$ interaction <chr> "We are usually at work during day ti…
$ house_rules <chr> "No smoking allowed! No pets.", "We w…
$ thumbnail_url <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ medium_url <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ picture_url <chr> "https://a0.muscache.com/im/pictures/…
$ xl_picture_url <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ host_id <dbl> 16774, 109777, 112210, 125230, 127577…
$ host_url <chr> "https://www.airbnb.com/users/show/16…
$ host_name <chr> "Simon", "Kari", "Oliver", "Nana", "S…
$ host_since <date> 2009-05-12, 2010-04-17, 2010-04-22, …
$ host_location <chr> "Copenhagen, Capital Region of Denmar…
$ host_about <chr> "I'm currently working as an environm…
$ host_response_time <chr> "N/A", "N/A", "within a few hours", "…
$ host_response_rate <chr> "N/A", "N/A", "100%", "N/A", "N/A", "…
$ host_acceptance_rate <chr> "33%", "19%", "100%", "17%", "N/A", "…
$ host_is_superhost <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FA…
$ host_thumbnail_url <chr> "https://a0.muscache.com/im/users/167…
$ host_picture_url <chr> "https://a0.muscache.com/im/users/167…
$ host_neighbourhood <chr> "Nørrebro", "Indre By", "Indre By", "…
$ host_listings_count <dbl> 1, 1, 4, 1, 1, 1, 3, 1, 0, 2, 1, 1, 2…
$ host_total_listings_count <dbl> 1, 1, 4, 1, 1, 1, 3, 1, 0, 2, 1, 1, 2…
$ host_verifications <chr> "['email', 'phone', 'reviews']", "['e…
$ host_has_profile_pic <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, T…
$ host_identity_verified <lgl> FALSE, FALSE, TRUE, FALSE, TRUE, FALS…
$ street <chr> "Copenhagen, Hovedstaden, Denmark", "…
$ neighbourhood <chr> "Nørrebro", "Indre By", "Indre By", "…
$ neighbourhood_cleansed <chr> "Nrrebro", "Indre By", "Indre By", "V…
$ neighbourhood_group_cleansed <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ city <chr> "Copenhagen", "Copenhagen", "Copenhag…
$ state <chr> "Hovedstaden", "Hovedstaden", "Hoveds…
$ zipcode <chr> "2200", "2100", "1210", "1650", "2100…
$ market <chr> "Copenhagen", "Copenhagen", "Copenhag…
$ smart_location <chr> "Copenhagen, Denmark", "Copenhagen, D…
$ country_code <chr> "DK", "DK", "DK", "DK", "DK", "DK", "…
$ country <chr> "Denmark", "Denmark", "Denmark", "Den…
$ latitude <dbl> 55.68798, 55.69163, 55.67590, 55.6706…
$ longitude <dbl> 12.54571, 12.57459, 12.57698, 12.5543…
$ is_location_exact <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, T…
$ property_type <chr> "Apartment", "House", "House", "Apart…
$ room_type <chr> "Private room", "Entire home/apt", "E…
$ accommodates <dbl> 2, 6, 12, 2, 4, 3, 3, 4, 5, 2, 2, 2, …
$ bathrooms <dbl> 1.0, 1.5, 2.5, 1.0, 1.0, 1.0, 2.0, 1.…
$ bedrooms <dbl> 1, 4, 6, 1, 3, 1, 1, 2, 2, 1, 1, 1, 1…
$ beds <dbl> 1, 4, 7, 1, 3, 3, 2, 2, 1, 1, 0, 1, 1…
$ bed_type <chr> "Real Bed", "Real Bed", "Real Bed", "…
$ amenities <chr> "{TV,\"Cable TV\",Wifi,Kitchen,\"Paid…
$ square_feet <dbl> 97, NA, NA, NA, NA, 689, NA, 807, NA,…
$ price <chr> "$365.00", "$2,398.00", "$3,096.00", …
$ weekly_price <chr> NA, NA, "$17,513.00", NA, "$2,981.00"…
$ monthly_price <chr> NA, NA, "$67,073.00", NA, "$8,943.00"…
$ security_deposit <chr> "$0.00", "$5,000.00", "$3,726.00", NA…
$ cleaning_fee <chr> "$33.00", "$1,100.00", "$522.00", "$3…
$ guests_included <dbl> 1, 3, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 2…
$ extra_people <chr> "$66.00", "$350.00", "$0.00", "$0.00"…
$ minimum_nights <dbl> 2, 3, 3, 7, 7, 2, 3, 6, 5, 30, 1, 3, …
$ maximum_nights <dbl> 15, 30, 31, 14, 31, 10, 365, 1125, 21…
$ minimum_minimum_nights <dbl> 2, 3, 3, 3, 7, 2, 3, 6, 5, 30, 1, 3, …
$ maximum_minimum_nights <dbl> 2, 3, 3, 5, 7, 2, 3, 6, 5, 30, 1, 3, …
$ minimum_maximum_nights <dbl> 15, 30, 1125, 14, 1125, 10, 1125, 112…
$ maximum_maximum_nights <dbl> 15, 30, 1125, 14, 1125, 10, 1125, 112…
$ minimum_nights_avg_ntm <dbl> 2.0, 3.0, 3.0, 4.1, 7.0, 2.0, 3.0, 6.…
$ maximum_nights_avg_ntm <dbl> 15, 30, 1125, 14, 1125, 10, 1125, 112…
$ calendar_updated <chr> "5 months ago", "4 months ago", "7 mo…
$ has_availability <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, T…
$ availability_30 <dbl> 29, 28, 29, 21, 0, 0, 8, 0, 11, 0, 0,…
$ availability_60 <dbl> 59, 58, 59, 21, 0, 0, 8, 0, 24, 0, 0,…
$ availability_90 <dbl> 89, 88, 89, 21, 0, 0, 8, 5, 24, 26, 0…
$ availability_365 <dbl> 89, 363, 172, 21, 0, 58, 8, 189, 24, …
$ calendar_last_scraped <date> 2020-06-28, 2020-06-28, 2020-06-28, …
$ number_of_reviews <dbl> 168, 50, 293, 22, 90, 17, 73, 7, 40, …
$ number_of_reviews_ltm <dbl> 1, 4, 31, 2, 0, 0, 1, 0, 0, 1, 11, 1,…
$ first_review <date> 2009-09-04, 2013-12-02, 2010-10-14, …
$ last_review <date> 2019-07-19, 2019-12-14, 2020-03-02, …
$ review_scores_rating <dbl> 96, 98, 91, 98, 94, 97, 98, 91, 97, 8…
$ review_scores_accuracy <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 9…
$ review_scores_cleanliness <dbl> 9, 10, 9, 10, 9, 10, 10, 9, 9, 8, 8, …
$ review_scores_checkin <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 1…
$ review_scores_communication <dbl> 10, 10, 10, 10, 9, 10, 10, 10, 10, 9,…
$ review_scores_location <dbl> 9, 10, 10, 10, 10, 10, 10, 9, 9, 10, …
$ review_scores_value <dbl> 9, 10, 9, 10, 9, 9, 9, 9, 10, 9, 9, 9…
$ requires_license <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FA…
$ license <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ jurisdiction_names <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ instant_bookable <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, FAL…
$ is_business_travel_ready <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FA…
$ cancellation_policy <chr> "moderate", "moderate", "moderate", "…
$ require_guest_profile_picture <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FA…
$ require_guest_phone_verification <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FA…
$ calculated_host_listings_count <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1…
$ calculated_host_listings_count_entire_homes <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1…
$ calculated_host_listings_count_private_rooms <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
$ calculated_host_listings_count_shared_rooms <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ reviews_per_month <dbl> 1.28, 0.62, 2.48, 0.18, 0.75, 0.14, 0…
#calendar <- read_csv('http://data.insideairbnb.com/denmark/hovedstaden/copenhagen/2020-06-26/data/calendar.csv.gz')
#calendar %>% glimpse()
#reviews <- read_csv('http://data.insideairbnb.com/denmark/hovedstaden/copenhagen/2020-06-26/data/reviews.csv.gz')
#reviews %>% glimpse()
# # And the summary plus geodata
# summaries_listings <- read_csv('http://data.insideairbnb.com/denmark/hovedstaden/copenhagen/2020-06-26/visualisations/listings.csv')
# summaries_reviews <- read_csv('http://data.insideairbnb.com/denmark/hovedstaden/copenhagen/2020-06-26/visualisations/reviews.csv')
# summaries_neighbourhoods <- read_csv('http://data.insideairbnb.com/denmark/hovedstaden/copenhagen/2020-06-26/visualisations/neighbourhoods.csv')
# The neighbourhood boundaries are published as GeoJSON, so we need a package
# that can read that format.
library(geojsonio)
# what = "sp" is passed so that geojson_read() returns a spatial (sp) object.
neighbourhoods_geojson <- geojson_read(
  "http://data.insideairbnb.com/denmark/hovedstaden/copenhagen/2020-06-26/visualisations/neighbourhoods.geojson",
  what = "sp"
)
# Number of listings per host, hosts with the most listings first.
count(listings, host_id, sort = TRUE)

# For one specific host (host_id 187610263): number of listings per
# neighbourhood, most frequent neighbourhood first.
count(
  filter(listings, host_id == 187610263),
  neighbourhood_cleansed,
  sort = TRUE
)
# Convert the price strings (e.g. "$2,398.00") to numbers, then derive the
# price per square foot from the freshly parsed price.
listings <- listings %>%
  mutate(
    price = parse_number(price),
    price_sqf = price / square_feet
  )

# Flag every listing whose host has five or more listings in this table.
# The grouping is dropped again so later steps work on ungrouped data.
listings <- listings %>%
  group_by(host_id) %>%
  mutate(host_professional = n() >= 5) %>%
  ungroup()
# Mean review score and mean price, split by the host_professional flag.
listings %>%
  group_by(host_professional) %>%
  summarise(
    review = mean(review_scores_rating, na.rm = TRUE),
    price = mean(price, na.rm = TRUE)
  )

# Mean review score per neighbourhood, spread into one column per value of
# host_professional so the two host types can be compared side by side.
listings %>%
  group_by(neighbourhood_cleansed, host_professional) %>%
  summarise(review = mean(review_scores_rating, na.rm = TRUE)) %>%
  pivot_wider(names_from = host_professional, values_from = review)
# desc_lenght (column name kept as spelled): number of "\\w+" matches, i.e.
# words, in the description.
# desc_long: TRUE when that count has a percent rank above 0.9.
listings <- listings %>%
  mutate(
    desc_lenght = str_count(description, "\\w+"),
    desc_long = percent_rank(desc_lenght) > 0.9
  )

# Mean review score for long vs. other descriptions.
listings %>%
  group_by(desc_long) %>%
  summarise(review = mean(review_scores_rating, na.rm = TRUE))
# Summary of the whole table via skimr.
listings %>% skim()

# Recode empty strings in character columns as missing values.
# - where(is.character): predicates must be wrapped in where() inside a
#   selection; passing a bare predicate function is deprecated.
# - na_if(.x, ""): unlike ifelse(.x == "", NA, .x), this keeps the column a
#   character vector even when every value is "" or NA.
listings %<>%
  mutate(across(where(is.character), ~ na_if(.x, "")))
# VIM supplies aggr(), used here to inspect missing values.
library(VIM)

# Missing-value overview for a handful of selected columns.
listings %>%
  select(
    host_is_superhost,
    review_scores_rating,
    host_response_time,
    name,
    host_since,
    zipcode
  ) %>%
  aggr(numbers = TRUE, prop = c(TRUE, FALSE))
# A "party place" is a listing that accommodates ten or more guests.
listings <- listings %>%
  mutate(party_place = accommodates >= 10)

# Per neighbourhood: how many party places there are, plus their mean review
# score and mean price; neighbourhoods with the most party places first.
listings %>%
  filter(party_place) %>%
  group_by(neighbourhood_cleansed) %>%
  summarise(
    n = n(),
    review = mean(review_scores_rating, na.rm = TRUE),
    price = mean(price, na.rm = TRUE)
  ) %>%
  arrange(desc(n))
library(leaflet)

# Interactive map with one clustered marker per listing. Clicking a marker
# opens a popup with the listing's name, host, price, room and property type.
listings %>%
  leaflet() %>%
  addTiles() %>%
  addMarkers(
    ~longitude, ~latitude,
    # FALSE rather than F: F is an ordinary variable and can be reassigned.
    labelOptions = labelOptions(noHide = FALSE),
    clusterOptions = markerClusterOptions(),
    # The formula is evaluated against the data passed to leaflet(), so the
    # popup text always lines up with the plotted rows (the global
    # `listings$...` vectors would not if the piped data were filtered).
    popup = ~ paste0(
      "<b> Name: </b>", name,
      "<br/><b> Host Name: </b>", host_name,
      "<br> <b> Price: </b>", price,
      "<br/><b> Room Type: </b>", room_type,
      "<br/><b> Property Type: </b>", property_type
    )
  ) %>%
  # setView(-74.00, 40.71, zoom = 12) %>%
  addProviderTiles("CartoDB.Positron")
# Turn the spatial object into a plain data frame that ggplot2 can draw,
# using the "neighbourhood" attribute as the region identifier.
# (Takes ~2 minutes.)
library(broom)
neighbourhoods_tidy <- tidy(neighbourhoods_geojson, region = "neighbourhood")
glimpse(neighbourhoods_tidy)

# Plain outline map of the neighbourhood polygons.
ggplot(neighbourhoods_tidy, aes(x = long, y = lat, group = group)) +
  geom_polygon() +
  theme_void() +
  coord_map()
# Per-neighbourhood aggregates: number of listings, mean price, mean rating.
neighborhood_agg <- listings %>%
  group_by(neighbourhood_cleansed) %>%
  summarise(
    n = n(),
    price_mean = mean(price, na.rm = TRUE),
    review_mean = mean(review_scores_rating, na.rm = TRUE)
  )

# Attach the aggregates to the polygon data.
# NOTE(review): this relies on neighbourhoods_tidy$id matching
# neighbourhood_cleansed exactly; verify for names with non-ASCII characters.
neighbourhoods_tidy <- left_join(
  neighbourhoods_tidy,
  neighborhood_agg,
  by = c("id" = "neighbourhood_cleansed")
)

# Choropleth: neighbourhoods filled by their number of listings.
ggplot(neighbourhoods_tidy, aes(x = long, y = lat, group = group, fill = n)) +
  geom_polygon() +
  theme_void() +
  coord_map()